//do file for prelimnary data setup and merges 

	*Created: 9-14-21 
	*Last modified: 10-3-21
	
*Directories
	if "`c(username)'"=="wb371044" {
		global base "C:\Users\\`c(username)'\OneDrive - WBG\Political Connections\"
	}

	global results		"$base\_results"
	global data 		"$base\_data"
	global assembled	"$data\_assembled"
	global dtafile		"C:\Users\wb371044\OneDrive - WBG\EIB-EBRD-WB report, BEEPS-MENA\_data\_MASTER\combined data.dta"
	global mfile		"$assembled\firm_scores_transform_stata.dta"

*Make sure programs are installed
	ssc install outreg2
	ssc install grqreg 
	
*Use main dataset and merge scores
	use "$dtafile" , clear
	
	//there's a little quirk on idstd
	preserve
	use "$mfile" , clear
	d 
	decode idstd, g(str)
	drop idstd
	destring str, force replace
	ren str idstd 
	tempfile scoremerge
	save `scoremerge' 
	restore
	
	merge idstd using `scoremerge'  , sort
	assert _m!=2
	drop _merge
	
*merge in country-level co-variates

	*GDP pc and power shift
	merge idstd using "$assembled\idstd_wCtyCovariates.dta", sort
	assert _m!=2
	drop _merge 
	
	*VDEM
	merge idstd using "$assembled\idstd_wVDEM_GDPpc.dta", sort
	assert _m!=2
	drop _merge 
	
	*CPI (transparency international)
	merge wbcode using "$assembled\CPI.dta"							, sort _merge(cpimerge) uniqusing
	drop if cpimerge==2
	
	*World Governance Indicators
	preserve
	use "C:\Users\wb371044\OneDrive - WBG\Political Connections\_data\_external\wgidataset.dta" , clear
	keep if inlist(year,2017, 2014,2012)
	sort country year
	foreach var in vae pve gee rqe rle cce {
		gen `var'_L5=`var'[_n-2] if countryname==countryname 
		gen `var'_L3=`var'[_n-1] if countryname==countryname 
		gen `var'_L5_diff=`var'-`var'[_n-2] if countryname==countryname 
		gen `var'_L3_diff=`var'-`var'[_n-1] if countryname==countryname 
	}
	
	foreach var in vas pvs ges rqs rls ccs {
		gen `var'_L5_se=`var'[_n-2] if countryname==countryname 
		gen `var'_L3_se=`var'[_n-1] if countryname==countryname 	
	}
	
	foreach stub in va pv ge rq rl cc {
		gen `stub'e_L5_rel=`stub'e_L5_diff/`stub's_L5_se 
		gen `stub'e_L3_rel=`stub'e_L3_diff/`stub's_L3_se 
		gen `stub'e_L5_major=(abs(`stub'e_L5_rel)>1)*sign(`stub'e_L5_rel)	 if `stub'e_L5_rel!=.
		replace `stub'e_L5_major=2 if `stub'e_L5_major==-1 //for factor var
		gen `stub'e_L3_major=(abs(`stub'e_L3_rel)>1)*sign(`stub'e_L3_rel)	 if `stub'e_L5_rel!=.	
		replace `stub'e_L3_major=2 if `stub'e_L3_major==-1 //for factor var
		tab `stub'e_L3_major
	}
	
	keep if year==2017
	rename code wbcode

	replace wbcode="XKX" if wbcode=="KSV"
	replace wbcode="ROU" if wbcode=="ROM"
	replace wbcode="PSE" if wbcode=="WBG"

	tempfile tomerge
	save `tomerge'
	restore

	merge wbcode using `tomerge' , sort _merge(wgimerge) uniqusing
	drop if wgimerge==2
	drop wgimerge
	
	merge wbcode using "$assembled\class_A1 3-15-2021.dta" , sort uniqusing 
	drop if _m==2
	drop _m
		
*Drop funky Morocco data
	drop wt_rs //will recompute
	drop if BMb5 ==. & wbcode=="MAR" & year==2019
	replace wt=BMwmedian if wbcode=="MAR" & year==2019
	bys country: egen wttot=sum(wt)
	gen wt_rs=wt/wttot
	table country, c(sum wt_rs)
	drop wttot
		
*co-variates	
	
	*log age
	gen lnage = ln(car1)
	label var lnage "Log of age of firm"
		
	*log size
	gen lnsize = ln(size_num )
	label var lnsize "Log of firm size"
		
	*manager's experience
	gen manager_exp =  b7 if b7>=0 & b7!=.

	*clean up manager experience
	replace manager_exp=. if manager_exp>60
	label var manager_exp "Top manager experience in sector (years)"
		
	*exporter dummy
	generate exporter_d = 1 if exporter==100
	replace exporter_d = 0 if exporter==0

	label var exporter_d "Direct exports 10% or more of sales Y/N"
	cap label drop YN
	label define YN 1 "Yes" 0 "No"
	label values exporter_d YN
		
	*ownership dummy
	recode ownership (100 = 1), gen(ownership_d)
	label var ownership_d "Foreign ownership  Y/N"
	label values ownership_d YN

	*female manager
	recode gend4 (100=1), gen(femmngr_d)
	
	*bank account
	recode k6 (2=0), gen(bankccnt)
	replace bankccnt =. if bankccnt<0
		
	*ISO
	recode t1 (100=1), gen(t1_d)
	label var t1_d "ISO Certification Ownership Y/N"

	*website
	recode t5 (100=1), gen(t5_d)
	label var t5_d "Website Y/N"
	
	*any gov
	gen anygov=car4>0 if car4!=.

	*wbcode
	encode wbcode, gen(wbcodenm)
		
	*BMB.5 Has the owner, CEO, top manager, or any of the board members of this firm ever been elected or appointed to a political position in this country?
	gen political = 1 if  BMb5==1
	replace political = 0 if BMb5==2
		
	*bank financing dummy
	gen fin7_d =1 if fin7>0 & fin7!=.
	replace fin7_d = 0 if fin7==0
	label var fin7_d "Firm uses bank financing  to finance day-to-day operations"
		
	*end Asif's code
	
	gen ln_size=ln(size_num)
	gen age=year-b5 if !missing(year) & !missing(b5) & year>=0 & b5>=0
	gen ln_age =ln(age)

	recode c22b (2=0)(-9=.), g(website)
	
	recode a7 (2=0)(-9=.),gen(largerfirm)
	
	gen ln_n2a_d2=ln(n2a_d2)
	gen fewcomp=inrange(e2b,0,20)*100 if e2b!=. & e2b!=-9
	gen fewcomp50=inrange(e2b,0,50)*100 if e2b!=. & e2b!=-9
	gen fewcomp10=inrange(e2b,0,10)*100 if e2b!=. & e2b!=-9

	*outcomes
	gen ln_lp=ln(d2_l1)
	gen ln_rev=ln(d2_gdp09)
	gen ln_avglabcost=ln(n2a_gdp09/size_num)

*two-digit sector
	gen _2digit=d1a2/100
	tostring _2digit , replace force
	split _2digit, p(".")	
	destring _2digit1, force replace
	
	
*cluster at a3ax sector_3	
	egen cty_industry=group(country _2digit1)
	egen loc_industry=group(a3ax _2digit1)

*create leave-out averages for firm_score_est
	*gen raw weighted product
	gen firm_score_est_rawwt=firm_score_est*wt 
	gen age_rawwt=car1*wt

	*...by country-industry 
	rangestat (sum) wt firm_score_est_rawwt (count) firm_score_est if !missing(firm_score_est), int(cty_industry 0 0) excludeself
	gen score_cellavg_cty_industry=firm_score_est_rawwt_sum/wt_sum if firm_score_est_count>=5 & !missing(firm_score_est_count)
	drop firm_score_est_rawwt_sum wt_sum firm_score_est_count

	*...by location-industry 
	rangestat (sum) wt firm_score_est_rawwt (count) firm_score_est if !missing(firm_score_est), int(loc_industry 0 0) excludeself
	gen score_cellavg_loc_industry=firm_score_est_rawwt_sum/wt_sum if firm_score_est_count>=5 & !missing(firm_score_est_count)
	drop firm_score_est_rawwt_sum wt_sum firm_score_est_count
	
	*...by location-industry 
	rangestat (sum) wt age_rawwt (count) age_rawwt if !missing(firm_score_est), int(loc_industry 0 0) excludeself
	gen age_cellavg_loc_industry=age_rawwt_sum/wt_sum if age_rawwt_count>=5 & !missing(car1)
	drop age_rawwt_sum age_rawwt_count wt_sum
	
	*...by country-industry 
	rangestat (sum) wt age_rawwt (count) age_rawwt if !missing(firm_score_est), int(cty_industry 0 0) excludeself
	gen age_cellavg_cty_industry=age_rawwt_sum/wt_sum if age_rawwt_count>=5 & !missing(car1)
	drop age_rawwt_sum age_rawwt_count wt_sum
	
	*recode to dummies
	foreach var in BMj3a BMj3b BMj3c fewcomp ln_lp ln_rev ln_n2a_d2 ln_avglabcost fin6 fin7 fin8 fin9 mgmt_agg  {
	    recode `var' (3 4=1)(0 1 2=0),g(`var'_dum)
	
		cap drop `var'_rawwt
		gen `var'_rawwt=`var'_dum*wt
	
		*...by location-industry 
		rangestat (sum) wt `var'_rawwt (count) `var'_rawwt if !missing(`var'_dum), int(loc_industry 0 0) excludeself
		gen `var'_loc_industry=`var'_rawwt_sum/wt_sum if `var'_rawwt_count>=5 & !missing(`var'_dum)
		drop `var'_rawwt_sum `var'_rawwt_count wt_sum
		
		*...by country-industry 
		rangestat (sum) wt `var'_rawwt (count) `var'_rawwt if !missing(`var'_dum), int(cty_industry 0 0) excludeself
		gen `var'_cty_industry=`var'_rawwt_sum/wt_sum if `var'_rawwt_count>=5 & !missing(`var'_dum)
		drop `var'_rawwt_sum `var'_rawwt_count wt_sum
		
	}
	

*Items as indicators
	recode BMb6 (-9 3=.)(1=100)(2=0), gen(assn)
	gen lobby=0 if assn!=. & BMb8d!=-9 //code -7 as a 0
	replace lobby=100 if inlist(BMb8d,3,4)
	
*log gdp pc
	gen ln_gdppc=ln(realGDPpc)
	
*too many competitors
	gen toomany=((e2b==-4) | e2b>=100) if !missing(e2b) & e2b!=-9
	replace toomany=toomany*100
	
*gov contract
	recode j6a (2=0)(-9=.)(1=100), g(govcontract)

*keep only if score has a value
	keep if firm_score_est!=.

*last round 
	gen latest=inlist(year, 2018,2019,2020)
	
*square term of index score
	gen score_sq=firm_score_est^2
	
*MNA variable
	gen mna=(region==5 & wbcode!="MLT")
	
*a3ax as numeric
	*encode a3ax, g(a3ax_num)
	egen a3ax_num=group(a3ax a1)
	
*GLOBALS for code
	global ai_controls 	"i.sector_MS"
	global mna			"if latest & region==5 & wbcode!="MLT""	
 	
*center score variable
	xi: reg firm_score_est i.country [pw=wt_rs]
	predict score_centered, residual //gen residual
	su score_centered [aw=wt_rs]
	global sd=`r(sd)'
	
*determine medians
	gen mergeall=1
	preserve
	collapse (mean) v2x_polyarchy v2x_libdem v2x_partipdem v2x_delibdem v2x_egaldem CPIscore2017 polconiii , by(country)
	drop country
	collapse (median) v2x_polyarchy v2x_libdem v2x_partipdem v2x_delibdem v2x_egaldem CPIscore2017 polconiii 
	foreach var in v2x_polyarchy v2x_libdem v2x_partipdem v2x_delibdem v2x_egaldem CPIscore2017 polconiii {
	    ren `var' p50_`var'
	}
	gen mergeall=1
	tempfile medmerge
	save `medmerge'
	restore
	
	merge mergeall using `medmerge' , sort uniqusing 
	assert _m==3
	drop _m
	
	foreach var in v2x_polyarchy v2x_libdem v2x_partipdem v2x_delibdem v2x_egaldem CPIscore2017 polconiii {
	    gen abovemed_`var'=(`var'>=p50_`var') if !missing(`var')
	}
	 
*recode CPI for above-median="high"
	recode abovemed_CPIscore2017 (0=1)(1=0)
	 
*rename mna to keep structure
	ren mna abovemed_mna	 
	
*For MNA report
	gen mna_comp=abovemed_mna if abovemed_mna==1 | (region==3 & inlist(incomegrp, "Lower middle income", "Upper middle income"))
	gen mna_cat=mna_comp
	replace mna_cat=2 if incomegrp=="Lower middle income" & mna_cat==0
	replace mna_cat=3 if incomegrp=="Upper middle income" & mna_cat==0
		
********** REVERSE CODE *******************
	replace CPIscore2017=100-CPIscore2017 
	foreach var in vae pve gee rqe rle cce {
		gen `var'_rs=100*(-2.5-`var')/(-2.5-2.5)
	}
	
*Keep everythign on the same scale
	replace v2x_polyarchy=v2x_polyarchy*100
	replace polconiii=polconiii*100
	
*Formatting
	graph set window fontface "Times New Roman"
	
*Label variables
	label var CPIscore2017 	"Corruption Perceptions"
	label var polconiii 	"Political Constraints"
	label var vae_rs		"Voice & Accountability"
	label var pve_rs		"Political Stability"
	label var gee_rs		"Government Effectiveness"
	label var rqe_rs		"Regulatory Quality"
	label var rle_rs		"Rule of Law"
	label var cce_rs		"Control of Corruption"
	label var ln_lp			"Labor Productivity"
	label var ln_size		"No. of Workers"
	label var ln_rev		"Total Sales"
	label var reg1			"Time Tax"
	label var toomany		"Many Competitors"
	*label var BMj3a_d		"Parliament"
	*label var BMj3b_d 		"Natl. Govt."
	*label var BMj3c_d		"Local Officials"
	label var fin6			"Internal Funds"
	label var fin7			"Banks"
	label var fin8			"Supplier Credit"
	label var fin9			"Other"
	label var govcontract	"Govt. Contract"
	label var pwrshift_last5yrs	"Power Shift"
	label var fewcomp		"Fewer than 20 competitors"
	label var ln_n2a_d2		"Labor Costs to Sales Ratio (log)"
	label var fewcomp_loc_industry  "Industry-location Avg. with Fewer than 20 competitors"
	label var ln_lp_loc_industry 	"Industry-location Avg. Sales per Worker (log)"
	label var ln_rev_loc_industry 	"Industry-location Avg. Total Sales (log)"
	label var ln_n2a_d2_loc_industry  "Industry-location Avg. Labor Costs to Sales (log)"
	
*Save configured dataset
	save "$assembled\data_for_analysis.dta" , replace